Email: sheng.xu@aa.com
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
import warnings
warnings.filterwarnings("ignore")
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
import pandas as pd
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from interpret.glassbox import ExplainableBoostingClassifier
# Load the gene-count table; read_csv infers zip compression from the
# file extension, so the archive is read directly.
counts_path = 'pbta-gene-counts-rsem-expected_count-collapsed.combined.filtered.target.highlow.500.T.zip'
df = pd.read_csv(counts_path)
df.shape
(415, 482)
df.columns
Index(['Unnamed: 0', 'MT-CO1', 'GFAP', 'MT-ND4', 'MT-CO3', 'EEF1A1', 'MT-CYB',
'MBP', 'ACTB', 'SPARC',
...
'ACSM2B', 'OR13C2', 'AC142391.1', 'SMIM28', 'AC018554.3', 'KLK8',
'OR6N1', 'FGF6', 'target', 'HighLowGrade'],
dtype='object', length=482)
df.head()
| Unnamed: 0 | MT-CO1 | GFAP | MT-ND4 | MT-CO3 | EEF1A1 | MT-CYB | MBP | ACTB | SPARC | ... | ACSM2B | OR13C2 | AC142391.1 | SMIM28 | AC018554.3 | KLK8 | OR6N1 | FGF6 | target | HighLowGrade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BS_0VXZCRJS | 688399.97 | 768375.44 | 489755.90 | 240854.75 | 484337.54 | 213623.0 | 3068.00 | 286578.16 | 462590.00 | ... | 1.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1 | BS_3AC3SRWH | 6885942.15 | 695792.99 | 3068246.64 | 2359968.34 | 145654.84 | 1838574.0 | 3673422.60 | 19284.02 | 4628.00 | ... | 0.0 | 0.0 | 2.00 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 |
| 2 | BS_4PPHAQXF | 1042630.21 | 249792.48 | 651345.95 | 390247.42 | 628701.58 | 253190.0 | 1445.64 | 230195.17 | 266144.00 | ... | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | BS_4PWDGEB0 | 914337.23 | 17581.45 | 588803.00 | 349555.99 | 305689.05 | 269897.0 | 809.00 | 217459.08 | 252539.99 | ... | 0.0 | 2.0 | 4.97 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 4 | BS_58YXHGAJ | 617360.80 | 477984.96 | 398712.00 | 248921.83 | 483424.31 | 189343.0 | 105316.27 | 171486.03 | 24650.99 | ... | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 482 columns
# Feature columns: everything after the sample-ID column ('Unnamed: 0',
# position 0) and before the two trailing label columns
# ('target', 'HighLowGrade').
train_cols = df.columns[1:-2]
# The classification label is the last column, 'HighLowGrade'.
label = df.columns[-1]
y = df[label]

# Fixed seed so the 80/20 split is reproducible.
seed = 1
# Split the full frame (IDs and label columns included) so the held-out rows
# can be written to disk intact; feature matrices are selected from the
# resulting partitions via train_cols.
df_train, df_test, y_train, y_test = train_test_split(
    df, y, test_size=0.20, random_state=seed
)
df_test.to_csv("df_test.csv")
df_train.shape
(332, 482)
df_test.shape
(83, 482)
df_train.head()
| Unnamed: 0 | MT-CO1 | GFAP | MT-ND4 | MT-CO3 | EEF1A1 | MT-CYB | MBP | ACTB | SPARC | ... | ACSM2B | OR13C2 | AC142391.1 | SMIM28 | AC018554.3 | KLK8 | OR6N1 | FGF6 | target | HighLowGrade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 159 | BS_D29RPBSZ | 452321.54 | 216161.0 | 199327.42 | 118267.33 | 54390.97 | 126867.0 | 871.00 | 86782.57 | 20818.00 | ... | 1.0 | 0.5 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 95 | BS_5CTVXVRX | 181744.03 | 91.0 | 17946.99 | 135002.63 | 1025556.42 | 16899.0 | 376.62 | 166512.70 | 26949.00 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 11 | BS_PXCPK5XS | 452627.50 | 17821.9 | 189167.00 | 181179.84 | 426570.13 | 140298.0 | 587.40 | 235700.00 | 213354.99 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 374 | BS_YZ2Z1Q6Y | 721220.89 | 377771.0 | 410082.00 | 205727.94 | 84671.03 | 219170.0 | 18584.00 | 46338.61 | 120781.00 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 165 | BS_C7A2TYAC | 524802.48 | 968914.0 | 219724.00 | 179642.96 | 237301.95 | 123222.0 | 43642.00 | 318150.53 | 110694.00 | ... | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 482 columns
df_test.head()
| Unnamed: 0 | MT-CO1 | GFAP | MT-ND4 | MT-CO3 | EEF1A1 | MT-CYB | MBP | ACTB | SPARC | ... | ACSM2B | OR13C2 | AC142391.1 | SMIM28 | AC018554.3 | KLK8 | OR6N1 | FGF6 | target | HighLowGrade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 388 | BS_862NMAR7 | 757880.34 | 1431919.0 | 465653.84 | 304550.91 | 379905.19 | 228292.0 | 44760.00 | 386329.89 | 246661.0 | ... | 0.0 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 1.0 | 1.0 |
| 102 | BS_66HQ3E4Z | 466915.21 | 1208314.0 | 217188.92 | 155404.86 | 228500.94 | 117132.0 | 38553.00 | 305845.75 | 217380.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 187 | BS_EFEZB0ZH | 322791.13 | 1740657.0 | 147939.87 | 121399.91 | 213259.97 | 91998.0 | 7386.00 | 288555.92 | 496903.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 162 | BS_C51RB0YR | 1150642.36 | 222780.0 | 499514.99 | 396129.86 | 296533.34 | 363720.0 | 2165.00 | 410370.93 | 702442.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 90 | BS_52426AMF | 656078.15 | 1845002.0 | 356146.08 | 203549.33 | 251682.86 | 169410.0 | 45588.73 | 286209.12 | 273309.0 | ... | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 482 columns
# Keep only the feature columns (train_cols) in each partition, dropping the
# sample-ID and label columns.
X_train = df_train.loc[:, train_cols]
X_test = df_test.loc[:, train_cols]
X_train.head()
| MT-CO1 | GFAP | MT-ND4 | MT-CO3 | EEF1A1 | MT-CYB | MBP | ACTB | SPARC | VIM | ... | Z84492.1 | IFNW1 | ACSM2B | OR13C2 | AC142391.1 | SMIM28 | AC018554.3 | KLK8 | OR6N1 | FGF6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 159 | 452321.54 | 216161.0 | 199327.42 | 118267.33 | 54390.97 | 126867.0 | 871.00 | 86782.57 | 20818.00 | 105224.00 | ... | 8.71 | 1.0 | 1.0 | 0.5 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 95 | 181744.03 | 91.0 | 17946.99 | 135002.63 | 1025556.42 | 16899.0 | 376.62 | 166512.70 | 26949.00 | 66706.00 | ... | 0.00 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 11 | 452627.50 | 17821.9 | 189167.00 | 181179.84 | 426570.13 | 140298.0 | 587.40 | 235700.00 | 213354.99 | 151174.56 | ... | 0.00 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 374 | 721220.89 | 377771.0 | 410082.00 | 205727.94 | 84671.03 | 219170.0 | 18584.00 | 46338.61 | 120781.00 | 15499.00 | ... | 0.55 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 165 | 524802.48 | 968914.0 | 219724.00 | 179642.96 | 237301.95 | 123222.0 | 43642.00 | 318150.53 | 110694.00 | 142524.00 | ... | 0.00 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 479 columns
X_test.head()
| MT-CO1 | GFAP | MT-ND4 | MT-CO3 | EEF1A1 | MT-CYB | MBP | ACTB | SPARC | VIM | ... | Z84492.1 | IFNW1 | ACSM2B | OR13C2 | AC142391.1 | SMIM28 | AC018554.3 | KLK8 | OR6N1 | FGF6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 388 | 757880.34 | 1431919.0 | 465653.84 | 304550.91 | 379905.19 | 228292.0 | 44760.00 | 386329.89 | 246661.0 | 89902.0 | ... | 8.35 | 0.0 | 0.0 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 |
| 102 | 466915.21 | 1208314.0 | 217188.92 | 155404.86 | 228500.94 | 117132.0 | 38553.00 | 305845.75 | 217380.0 | 72290.0 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 187 | 322791.13 | 1740657.0 | 147939.87 | 121399.91 | 213259.97 | 91998.0 | 7386.00 | 288555.92 | 496903.0 | 108977.0 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 162 | 1150642.36 | 222780.0 | 499514.99 | 396129.86 | 296533.34 | 363720.0 | 2165.00 | 410370.93 | 702442.0 | 296294.0 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 90 | 656078.15 | 1845002.0 | 356146.08 | 203549.33 | 251682.86 | 169410.0 | 45588.73 | 286209.12 | 273309.0 | 156027.0 | ... | 0.00 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 |
5 rows × 479 columns
y
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
...
410 1.0
411 1.0
412 1.0
413 1.0
414 1.0
Name: HighLowGrade, Length: 415, dtype: float64
#seed = 1
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# NOTE(review): this rebinds the name LogisticRegression, which was imported
# from sklearn.linear_model earlier in the file; from here on the name refers
# to the interpret.glassbox class.
from interpret.glassbox import LogisticRegression
from interpret import show
# Fit an Explainable Boosting Machine on the training features, seeding it
# with the same `seed` value used for the train/test split.
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)
ExplainableBoostingClassifier(feature_names=['MT-CO1', 'GFAP', 'MT-ND4',
'MT-CO3', 'EEF1A1', 'MT-CYB',
'MBP', 'ACTB', 'SPARC', 'VIM',
'PLP1', 'TUBA1A', 'MT-ND3', 'FTH1',
'APOE', 'AQP4', 'BCAN', 'FTL',
'SPP1', 'RPL3', 'TUBA1B', 'SCD',
'EEF1G', 'APP', 'HNRNPA1', 'IGF2',
'CST3', 'RPS6', 'HSP90AA1',
'CHI3L1', ...],
feature_types=['continuous', 'continuous',
'continuous', 'continuous',
'...ous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous',
'continuous', 'continuous', ...],
random_state=1)
# Build the global explanation object for the fitted EBM and render it via
# interpret's show(); the visualize provider was set to InlineProvider at the
# top of the file.
ebm_global = ebm.explain_global()
show(ebm_global)